In [1]:
import requests
from lxml import html
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from pprint import pprint
import re
%matplotlib inline
In [2]:
url = "https://www.washingtonpost.com/news/the-fix/wp/2016/01/17/the-4th-democratic-debate-transcript-annotated-who-said-what-and-what-it-meant/"
response = requests.get(url)
In [3]:
doc = html.fromstring(response.text)
In [4]:
para_list = doc.xpath("//article/p/text()")
In [5]:
para_list = para_list[2:]
In [6]:
pprint(para_list[:2], compact=True)
print(para_list[-2:])
["HOLT: We'll begin with 45 second opening statements from each candidate, "
'starting with Secretary Clinton. ',
'CLINTON: Well, good evening. And I want to thank the Congressional Black '
'Caucus Institute and the people of Charleston for hosting us here on the '
'eve of Martin Luther King Day tomorrow. ']
['HOLT: All right. Well thank you and thanks to all of you for being here tonight shedding light on some of the differences as Americans get ready to vote. ', "I also want to thank the Congressional Black Caucus Institute and certainly my friend and colleague, Andrea Mitchell. This has been great. It's been a great spirited conversation and American people appreciate it. "]
In [7]:
dataset = pd.DataFrame(para_list, columns=["raw"])
dataset
Out[7]:
raw
0
HOLT: We'll begin with 45 second opening state...
1
CLINTON: Well, good evening. And I want to tha...
2
You know, I remember well when my youth minist...
3
And that is our fight still. We have to get th...
4
I understand that this is the hardest job in t...
5
(APPLAUSE)
6
HOLT: Thank you. Senator Sanders, your opening...
7
SANDERS: Thank you. As we honor the extraordin...
8
SANDERS: And then, to make a bad situation wor...
9
This campaign is about a political revolution ...
10
HOLT: Senator, thank you.
11
(APPLAUSE)
12
And Governor O'Malley, your opening statement,...
13
O'MALLEY: Thank you. My name is Martin O'Malle...
14
And I want to thank the people of South Caroli...
15
You taught us, in fact, in keeping with Dr. Ki...
16
Eight years ago, you brought forward a new lea...
17
But in order to make good on the promise of eq...
18
We need new leadership. We need to come togeth...
19
That's why I'm running for president. I need y...
20
Thank you.
21
HOLT: All right. And Governor, thank you.
22
(APPLAUSE)
23
HOLT: All right, to our first question, now. T...
24
President Obama came to office determined to s...
25
Senator Sanders.
26
SANDERS: Well, that's what our campaign is abo...
27
So, what my first days are about is bringing A...
28
(APPLAUSE)
29
HOLT: Secretary Clinton, same question, my fir...
...
...
592
HOLT: Welcome back everybody. Finally, before ...
593
And, we'll start with Governor O'Malley.
594
(LAUGHTER)
595
HOLT: Didn't see that coming, did you?
596
O'MALLEY: Yes, but we're going to have to get ...
597
(LAUGHTER)
598
MITCHELL: ...too long (ph).
599
O'MALLEY: I believe there are many issues. I h...
600
HOLT: Sixty seconds, we'd appreciate it.
601
O'MALLEY: There are so many issues that we hav...
602
(APPLAUSE)
603
O'MALLEY: We haven't discussed the fact that i...
604
I guess the bottom line is this, look we are a...
605
We're on the threshold of a new era of America...
606
HOLT: And that's time.
607
O'MALLEY: Thanks a lot.
608
HOLT: Secretary Clinton?
609
CLINTON: Well Lester, I spent a lot of time la...
610
He had request for help and he had basically s...
611
So I sent my top campaign aide down there to t...
612
HOLT: And that's time.
613
CLINTON: I want to be a president who takes ca...
614
(APPLAUSE)
615
HOLT: Thank you.
616
Senator Sanders?
617
SANDERS: Well, Secretary Clinton was right and...
618
Now, we are a great nation -- and we've heard ...
619
We've got to get rid of Super PACs, we've got ...
620
HOLT: All right. Well thank you and thanks to ...
621
I also want to thank the Congressional Black C...
622 rows × 1 columns
In [8]:
def get_name(x):
r = re.findall(r"^([A-Z']*):", x)
if r:
return r[0]
else:
return np.NaN
In [9]:
dataset["speaker"] = dataset.raw.apply(get_name).fillna(method='ffill')
dataset
Out[9]:
raw
speaker
0
HOLT: We'll begin with 45 second opening state...
HOLT
1
CLINTON: Well, good evening. And I want to tha...
CLINTON
2
You know, I remember well when my youth minist...
CLINTON
3
And that is our fight still. We have to get th...
CLINTON
4
I understand that this is the hardest job in t...
CLINTON
5
(APPLAUSE)
CLINTON
6
HOLT: Thank you. Senator Sanders, your opening...
HOLT
7
SANDERS: Thank you. As we honor the extraordin...
SANDERS
8
SANDERS: And then, to make a bad situation wor...
SANDERS
9
This campaign is about a political revolution ...
SANDERS
10
HOLT: Senator, thank you.
HOLT
11
(APPLAUSE)
HOLT
12
And Governor O'Malley, your opening statement,...
HOLT
13
O'MALLEY: Thank you. My name is Martin O'Malle...
O'MALLEY
14
And I want to thank the people of South Caroli...
O'MALLEY
15
You taught us, in fact, in keeping with Dr. Ki...
O'MALLEY
16
Eight years ago, you brought forward a new lea...
O'MALLEY
17
But in order to make good on the promise of eq...
O'MALLEY
18
We need new leadership. We need to come togeth...
O'MALLEY
19
That's why I'm running for president. I need y...
O'MALLEY
20
Thank you.
O'MALLEY
21
HOLT: All right. And Governor, thank you.
HOLT
22
(APPLAUSE)
HOLT
23
HOLT: All right, to our first question, now. T...
HOLT
24
President Obama came to office determined to s...
HOLT
25
Senator Sanders.
HOLT
26
SANDERS: Well, that's what our campaign is abo...
SANDERS
27
So, what my first days are about is bringing A...
SANDERS
28
(APPLAUSE)
SANDERS
29
HOLT: Secretary Clinton, same question, my fir...
HOLT
...
...
...
592
HOLT: Welcome back everybody. Finally, before ...
HOLT
593
And, we'll start with Governor O'Malley.
HOLT
594
(LAUGHTER)
HOLT
595
HOLT: Didn't see that coming, did you?
HOLT
596
O'MALLEY: Yes, but we're going to have to get ...
O'MALLEY
597
(LAUGHTER)
O'MALLEY
598
MITCHELL: ...too long (ph).
MITCHELL
599
O'MALLEY: I believe there are many issues. I h...
O'MALLEY
600
HOLT: Sixty seconds, we'd appreciate it.
HOLT
601
O'MALLEY: There are so many issues that we hav...
O'MALLEY
602
(APPLAUSE)
O'MALLEY
603
O'MALLEY: We haven't discussed the fact that i...
O'MALLEY
604
I guess the bottom line is this, look we are a...
O'MALLEY
605
We're on the threshold of a new era of America...
O'MALLEY
606
HOLT: And that's time.
HOLT
607
O'MALLEY: Thanks a lot.
O'MALLEY
608
HOLT: Secretary Clinton?
HOLT
609
CLINTON: Well Lester, I spent a lot of time la...
CLINTON
610
He had request for help and he had basically s...
CLINTON
611
So I sent my top campaign aide down there to t...
CLINTON
612
HOLT: And that's time.
HOLT
613
CLINTON: I want to be a president who takes ca...
CLINTON
614
(APPLAUSE)
CLINTON
615
HOLT: Thank you.
HOLT
616
Senator Sanders?
HOLT
617
SANDERS: Well, Secretary Clinton was right and...
SANDERS
618
Now, we are a great nation -- and we've heard ...
SANDERS
619
We've got to get rid of Super PACs, we've got ...
SANDERS
620
HOLT: All right. Well thank you and thanks to ...
HOLT
621
I also want to thank the Congressional Black C...
HOLT
622 rows × 2 columns
In [10]:
dataset.speaker.value_counts()
Out[10]:
SANDERS 168
HOLT 152
CLINTON 131
O'MALLEY 113
MITCHELL 43
TODD 7
BROWNLEE 4
FRANTA 2
MILLER 2
Name: speaker, dtype: int64
In [11]:
get_speach = lambda x: re.sub("^[A-Z']*:\s", "", x)
dataset["speach"] = dataset.raw.apply(get_speach)
dataset
Out[11]:
raw
speaker
speach
0
HOLT: We'll begin with 45 second opening state...
HOLT
We'll begin with 45 second opening statements ...
1
CLINTON: Well, good evening. And I want to tha...
CLINTON
Well, good evening. And I want to thank the Co...
2
You know, I remember well when my youth minist...
CLINTON
You know, I remember well when my youth minist...
3
And that is our fight still. We have to get th...
CLINTON
And that is our fight still. We have to get th...
4
I understand that this is the hardest job in t...
CLINTON
I understand that this is the hardest job in t...
5
(APPLAUSE)
CLINTON
(APPLAUSE)
6
HOLT: Thank you. Senator Sanders, your opening...
HOLT
Thank you. Senator Sanders, your opening state...
7
SANDERS: Thank you. As we honor the extraordin...
SANDERS
Thank you. As we honor the extraordinary life ...
8
SANDERS: And then, to make a bad situation wor...
SANDERS
And then, to make a bad situation worse, we ha...
9
This campaign is about a political revolution ...
SANDERS
This campaign is about a political revolution ...
10
HOLT: Senator, thank you.
HOLT
Senator, thank you.
11
(APPLAUSE)
HOLT
(APPLAUSE)
12
And Governor O'Malley, your opening statement,...
HOLT
And Governor O'Malley, your opening statement,...
13
O'MALLEY: Thank you. My name is Martin O'Malle...
O'MALLEY
Thank you. My name is Martin O'Malley, I was b...
14
And I want to thank the people of South Caroli...
O'MALLEY
And I want to thank the people of South Caroli...
15
You taught us, in fact, in keeping with Dr. Ki...
O'MALLEY
You taught us, in fact, in keeping with Dr. Ki...
16
Eight years ago, you brought forward a new lea...
O'MALLEY
Eight years ago, you brought forward a new lea...
17
But in order to make good on the promise of eq...
O'MALLEY
But in order to make good on the promise of eq...
18
We need new leadership. We need to come togeth...
O'MALLEY
We need new leadership. We need to come togeth...
19
That's why I'm running for president. I need y...
O'MALLEY
That's why I'm running for president. I need y...
20
Thank you.
O'MALLEY
Thank you.
21
HOLT: All right. And Governor, thank you.
HOLT
All right. And Governor, thank you.
22
(APPLAUSE)
HOLT
(APPLAUSE)
23
HOLT: All right, to our first question, now. T...
HOLT
All right, to our first question, now. The fir...
24
President Obama came to office determined to s...
HOLT
President Obama came to office determined to s...
25
Senator Sanders.
HOLT
Senator Sanders.
26
SANDERS: Well, that's what our campaign is abo...
SANDERS
Well, that's what our campaign is about. It is...
27
So, what my first days are about is bringing A...
SANDERS
So, what my first days are about is bringing A...
28
(APPLAUSE)
SANDERS
(APPLAUSE)
29
HOLT: Secretary Clinton, same question, my fir...
HOLT
Secretary Clinton, same question, my first 100...
...
...
...
...
592
HOLT: Welcome back everybody. Finally, before ...
HOLT
Welcome back everybody. Finally, before we go ...
593
And, we'll start with Governor O'Malley.
HOLT
And, we'll start with Governor O'Malley.
594
(LAUGHTER)
HOLT
(LAUGHTER)
595
HOLT: Didn't see that coming, did you?
HOLT
Didn't see that coming, did you?
596
O'MALLEY: Yes, but we're going to have to get ...
O'MALLEY
Yes, but we're going to have to get 20 minutes...
597
(LAUGHTER)
O'MALLEY
(LAUGHTER)
598
MITCHELL: ...too long (ph).
MITCHELL
...too long (ph).
599
O'MALLEY: I believe there are many issues. I h...
O'MALLEY
I believe there are many issues. I have 60 sec...
600
HOLT: Sixty seconds, we'd appreciate it.
HOLT
Sixty seconds, we'd appreciate it.
601
O'MALLEY: There are so many issues that we hav...
O'MALLEY
There are so many issues that we haven't been ...
602
(APPLAUSE)
O'MALLEY
(APPLAUSE)
603
O'MALLEY: We haven't discussed the fact that i...
O'MALLEY
We haven't discussed the fact that in our hemi...
604
I guess the bottom line is this, look we are a...
O'MALLEY
I guess the bottom line is this, look we are a...
605
We're on the threshold of a new era of America...
O'MALLEY
We're on the threshold of a new era of America...
606
HOLT: And that's time.
HOLT
And that's time.
607
O'MALLEY: Thanks a lot.
O'MALLEY
Thanks a lot.
608
HOLT: Secretary Clinton?
HOLT
Secretary Clinton?
609
CLINTON: Well Lester, I spent a lot of time la...
CLINTON
Well Lester, I spent a lot of time last week b...
610
He had request for help and he had basically s...
CLINTON
He had request for help and he had basically s...
611
So I sent my top campaign aide down there to t...
CLINTON
So I sent my top campaign aide down there to t...
612
HOLT: And that's time.
HOLT
And that's time.
613
CLINTON: I want to be a president who takes ca...
CLINTON
I want to be a president who takes care of the...
614
(APPLAUSE)
CLINTON
(APPLAUSE)
615
HOLT: Thank you.
HOLT
Thank you.
616
Senator Sanders?
HOLT
Senator Sanders?
617
SANDERS: Well, Secretary Clinton was right and...
SANDERS
Well, Secretary Clinton was right and what I d...
618
Now, we are a great nation -- and we've heard ...
SANDERS
Now, we are a great nation -- and we've heard ...
619
We've got to get rid of Super PACs, we've got ...
SANDERS
We've got to get rid of Super PACs, we've got ...
620
HOLT: All right. Well thank you and thanks to ...
HOLT
All right. Well thank you and thanks to all of...
621
I also want to thank the Congressional Black C...
HOLT
I also want to thank the Congressional Black C...
622 rows × 3 columns
In [12]:
applause_ds = dataset[dataset.speach == "(APPLAUSE)"]
len(applause_ds)
Out[12]:
34
In [13]:
applause_ds.speaker.value_counts()
Out[13]:
SANDERS 12
CLINTON 12
O'MALLEY 7
HOLT 3
Name: speaker, dtype: int64
In [14]:
applause_counts = applause_ds.speaker.value_counts().sort_values()
bottom = [index for index, item in enumerate(applause_counts.index)]
plt.barh(bottom, width=applause_counts, color="orange", linewidth=0)
y_labels = ["%s %.1f%%" % (item, 100.0*applause_counts[item]/len(applause_ds)) for index,item in enumerate(applause_counts.index)]
plt.yticks(np.array(bottom)+0.4, y_labels)
applause_counts
Out[14]:
HOLT 3
O'MALLEY 7
SANDERS 12
CLINTON 12
Name: speaker, dtype: int64
In [15]:
word_count = lambda x: len(re.findall("[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+",x))
In [16]:
dataset["word_count"] = dataset.speach.apply(word_count)
dataset
Out[16]:
raw
speaker
speach
word_count
0
HOLT: We'll begin with 45 second opening state...
HOLT
We'll begin with 45 second opening statements ...
14
1
CLINTON: Well, good evening. And I want to tha...
CLINTON
Well, good evening. And I want to thank the Co...
31
2
You know, I remember well when my youth minist...
CLINTON
You know, I remember well when my youth minist...
67
3
And that is our fight still. We have to get th...
CLINTON
And that is our fight still. We have to get th...
50
4
I understand that this is the hardest job in t...
CLINTON
I understand that this is the hardest job in t...
42
5
(APPLAUSE)
CLINTON
(APPLAUSE)
1
6
HOLT: Thank you. Senator Sanders, your opening...
HOLT
Thank you. Senator Sanders, your opening state...
8
7
SANDERS: Thank you. As we honor the extraordin...
SANDERS
Thank you. As we honor the extraordinary life ...
88
8
SANDERS: And then, to make a bad situation wor...
SANDERS
And then, to make a bad situation worse, we ha...
28
9
This campaign is about a political revolution ...
SANDERS
This campaign is about a political revolution ...
18
10
HOLT: Senator, thank you.
HOLT
Senator, thank you.
3
11
(APPLAUSE)
HOLT
(APPLAUSE)
1
12
And Governor O'Malley, your opening statement,...
HOLT
And Governor O'Malley, your opening statement,...
7
13
O'MALLEY: Thank you. My name is Martin O'Malle...
O'MALLEY
Thank you. My name is Martin O'Malley, I was b...
21
14
And I want to thank the people of South Caroli...
O'MALLEY
And I want to thank the people of South Caroli...
38
15
You taught us, in fact, in keeping with Dr. Ki...
O'MALLEY
You taught us, in fact, in keeping with Dr. Ki...
37
16
Eight years ago, you brought forward a new lea...
O'MALLEY
Eight years ago, you brought forward a new lea...
34
17
But in order to make good on the promise of eq...
O'MALLEY
But in order to make good on the promise of eq...
47
18
We need new leadership. We need to come togeth...
O'MALLEY
We need new leadership. We need to come togeth...
23
19
That's why I'm running for president. I need y...
O'MALLEY
That's why I'm running for president. I need y...
26
20
Thank you.
O'MALLEY
Thank you.
2
21
HOLT: All right. And Governor, thank you.
HOLT
All right. And Governor, thank you.
6
22
(APPLAUSE)
HOLT
(APPLAUSE)
1
23
HOLT: All right, to our first question, now. T...
HOLT
All right, to our first question, now. The fir...
18
24
President Obama came to office determined to s...
HOLT
President Obama came to office determined to s...
52
25
Senator Sanders.
HOLT
Senator Sanders.
2
26
SANDERS: Well, that's what our campaign is abo...
SANDERS
Well, that's what our campaign is about. It is...
68
27
So, what my first days are about is bringing A...
SANDERS
So, what my first days are about is bringing A...
61
28
(APPLAUSE)
SANDERS
(APPLAUSE)
1
29
HOLT: Secretary Clinton, same question, my fir...
HOLT
Secretary Clinton, same question, my first 100...
16
...
...
...
...
...
592
HOLT: Welcome back everybody. Finally, before ...
HOLT
Welcome back everybody. Finally, before we go ...
48
593
And, we'll start with Governor O'Malley.
HOLT
And, we'll start with Governor O'Malley.
6
594
(LAUGHTER)
HOLT
(LAUGHTER)
1
595
HOLT: Didn't see that coming, did you?
HOLT
Didn't see that coming, did you?
6
596
O'MALLEY: Yes, but we're going to have to get ...
O'MALLEY
Yes, but we're going to have to get 20 minutes...
14
597
(LAUGHTER)
O'MALLEY
(LAUGHTER)
1
598
MITCHELL: ...too long (ph).
MITCHELL
...too long (ph).
3
599
O'MALLEY: I believe there are many issues. I h...
O'MALLEY
I believe there are many issues. I have 60 sec...
12
600
HOLT: Sixty seconds, we'd appreciate it.
HOLT
Sixty seconds, we'd appreciate it.
5
601
O'MALLEY: There are so many issues that we hav...
O'MALLEY
There are so many issues that we haven't been ...
61
602
(APPLAUSE)
O'MALLEY
(APPLAUSE)
1
603
O'MALLEY: We haven't discussed the fact that i...
O'MALLEY
We haven't discussed the fact that in our hemi...
27
604
I guess the bottom line is this, look we are a...
O'MALLEY
I guess the bottom line is this, look we are a...
80
605
We're on the threshold of a new era of America...
O'MALLEY
We're on the threshold of a new era of America...
33
606
HOLT: And that's time.
HOLT
And that's time.
3
607
O'MALLEY: Thanks a lot.
O'MALLEY
Thanks a lot.
3
608
HOLT: Secretary Clinton?
HOLT
Secretary Clinton?
2
609
CLINTON: Well Lester, I spent a lot of time la...
CLINTON
Well Lester, I spent a lot of time last week b...
72
610
He had request for help and he had basically s...
CLINTON
He had request for help and he had basically s...
38
611
So I sent my top campaign aide down there to t...
CLINTON
So I sent my top campaign aide down there to t...
59
612
HOLT: And that's time.
HOLT
And that's time.
3
613
CLINTON: I want to be a president who takes ca...
CLINTON
I want to be a president who takes care of the...
25
614
(APPLAUSE)
CLINTON
(APPLAUSE)
1
615
HOLT: Thank you.
HOLT
Thank you.
2
616
Senator Sanders?
HOLT
Senator Sanders?
2
617
SANDERS: Well, Secretary Clinton was right and...
SANDERS
Well, Secretary Clinton was right and what I d...
32
618
Now, we are a great nation -- and we've heard ...
SANDERS
Now, we are a great nation -- and we've heard ...
58
619
We've got to get rid of Super PACs, we've got ...
SANDERS
We've got to get rid of Super PACs, we've got ...
73
620
HOLT: All right. Well thank you and thanks to ...
HOLT
All right. Well thank you and thanks to all of...
28
621
I also want to thank the Congressional Black C...
HOLT
I also want to thank the Congressional Black C...
33
622 rows × 4 columns
In [17]:
words_ds = dataset[dataset.speaker.isin(["CLINTON","SANDERS","O'MALLEY"])]
In [18]:
words_counts = words_ds.pivot_table(values="word_count", index="speaker", columns=None, aggfunc='mean',).sort_values()
bottom = [index for index, item in enumerate(words_counts.index)]
plt.barh(bottom, width=words_counts, color="orange", linewidth=0)
y_labels = ["%s %.1f words/paragraph" % (item, words_counts[item]) for index,item in enumerate(words_counts.index)]
plt.yticks(np.array(bottom)+0.4, y_labels)
words_counts
Out[18]:
speaker
O'MALLEY 25.115044
SANDERS 27.351190
CLINTON 32.656489
Name: word_count, dtype: float64
In [19]:
words_counts = words_ds.pivot_table(values="word_count", index="speaker", columns=None, aggfunc='sum',).sort_values()
bottom = [index for index, item in enumerate(words_counts.index)]
plt.barh(bottom, width=words_counts, color="orange", linewidth=0)
y_labels = ["%s %d (%.1f%%)" % (item, words_counts[item], 100.0*words_counts[item]/np.sum(words_counts)) for index,item in enumerate(words_counts.index)]
plt.yticks(np.array(bottom)+0.4, y_labels)
words_counts
Out[19]:
speaker
O'MALLEY 2838
CLINTON 4278
SANDERS 4595
Name: word_count, dtype: int64
In [20]:
speaker_dict = {value:index for index,value in enumerate(words_ds.speaker.unique())}
speaker_dict
Out[20]:
{'CLINTON': 0, "O'MALLEY": 2, 'SANDERS': 1}
In [21]:
words_ds["speaker_no"] = words_ds.speaker.map(speaker_dict)
words_ds
-c:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[21]:
raw
speaker
speach
word_count
speaker_no
1
CLINTON: Well, good evening. And I want to tha...
CLINTON
Well, good evening. And I want to thank the Co...
31
0
2
You know, I remember well when my youth minist...
CLINTON
You know, I remember well when my youth minist...
67
0
3
And that is our fight still. We have to get th...
CLINTON
And that is our fight still. We have to get th...
50
0
4
I understand that this is the hardest job in t...
CLINTON
I understand that this is the hardest job in t...
42
0
5
(APPLAUSE)
CLINTON
(APPLAUSE)
1
0
7
SANDERS: Thank you. As we honor the extraordin...
SANDERS
Thank you. As we honor the extraordinary life ...
88
1
8
SANDERS: And then, to make a bad situation wor...
SANDERS
And then, to make a bad situation worse, we ha...
28
1
9
This campaign is about a political revolution ...
SANDERS
This campaign is about a political revolution ...
18
1
13
O'MALLEY: Thank you. My name is Martin O'Malle...
O'MALLEY
Thank you. My name is Martin O'Malley, I was b...
21
2
14
And I want to thank the people of South Caroli...
O'MALLEY
And I want to thank the people of South Caroli...
38
2
15
You taught us, in fact, in keeping with Dr. Ki...
O'MALLEY
You taught us, in fact, in keeping with Dr. Ki...
37
2
16
Eight years ago, you brought forward a new lea...
O'MALLEY
Eight years ago, you brought forward a new lea...
34
2
17
But in order to make good on the promise of eq...
O'MALLEY
But in order to make good on the promise of eq...
47
2
18
We need new leadership. We need to come togeth...
O'MALLEY
We need new leadership. We need to come togeth...
23
2
19
That's why I'm running for president. I need y...
O'MALLEY
That's why I'm running for president. I need y...
26
2
20
Thank you.
O'MALLEY
Thank you.
2
2
26
SANDERS: Well, that's what our campaign is abo...
SANDERS
Well, that's what our campaign is about. It is...
68
1
27
So, what my first days are about is bringing A...
SANDERS
So, what my first days are about is bringing A...
61
1
28
(APPLAUSE)
SANDERS
(APPLAUSE)
1
1
30
CLINTON: I would work quickly to present to th...
CLINTON
I would work quickly to present to the Congres...
35
0
31
I would also...
CLINTON
I would also...
3
0
32
(APPLAUSE)
CLINTON
(APPLAUSE)
1
0
33
I would also be presenting my plans to build o...
CLINTON
I would also be presenting my plans to build o...
67
0
34
And third, I would be working, in every way th...
CLINTON
And third, I would be working, in every way th...
78
0
35
(APPLAUSE)
CLINTON
(APPLAUSE)
1
0
37
O'MALLEY: Thank you. First of all, I would lay...
O'MALLEY
Thank you. First of all, I would lay out an ag...
77
2
38
Secondly, I believe the greatest business oppo...
O'MALLEY
Secondly, I believe the greatest business oppo...
48
2
39
(APPLAUSE)
O'MALLEY
(APPLAUSE)
1
2
41
O'MALLEY: Finally -- I'm sorry, that was secon...
O'MALLEY
Finally -- I'm sorry, that was second, Lester.
8
2
42
O'MALLEY: And third and finally, we need a new...
O'MALLEY
And third and finally, we need a new agenda fo...
77
2
...
...
...
...
...
...
573
We just have to do more of it, and we have to ...
CLINTON
We just have to do more of it, and we have to ...
32
0
576
SANDERS: Great ideas, Governor O'Malley, Secre...
SANDERS
Great ideas, Governor O'Malley, Secretary Clin...
27
1
577
So here's a promise that I make -- and I menti...
SANDERS
So here's a promise that I make -- and I menti...
43
1
578
Here's a promise. If elected president, Goldma...
SANDERS
Here's a promise. If elected president, Goldma...
24
1
579
(APPLAUSE)
SANDERS
(APPLAUSE)
1
1
581
SANDERS: I was asked a question. You know, one...
SANDERS
I was asked a question. You know, one of the t...
58
1
582
I have avoided doing that. Trying to run an is...
SANDERS
I have avoided doing that. Trying to run an is...
11
1
583
(APPLAUSE)
SANDERS
(APPLAUSE)
1
1
584
SANDERS: I was asked a question.
SANDERS
I was asked a question.
5
1
586
SANDERS: Well -- then if I don't answer it, th...
SANDERS
Well -- then if I don't answer it, then there'...
17
1
587
(LAUGHTER)
SANDERS
(LAUGHTER)
1
1
588
And I mean this seriously. You know that. We'v...
SANDERS
And I mean this seriously. You know that. We'v...
51
1
589
(APPLAUSE)
SANDERS
(APPLAUSE)
1
1
596
O'MALLEY: Yes, but we're going to have to get ...
O'MALLEY
Yes, but we're going to have to get 20 minutes...
14
2
597
(LAUGHTER)
O'MALLEY
(LAUGHTER)
1
2
599
O'MALLEY: I believe there are many issues. I h...
O'MALLEY
I believe there are many issues. I have 60 sec...
12
2
601
O'MALLEY: There are so many issues that we hav...
O'MALLEY
There are so many issues that we haven't been ...
61
2
602
(APPLAUSE)
O'MALLEY
(APPLAUSE)
1
2
603
O'MALLEY: We haven't discussed the fact that i...
O'MALLEY
We haven't discussed the fact that in our hemi...
27
2
604
I guess the bottom line is this, look we are a...
O'MALLEY
I guess the bottom line is this, look we are a...
80
2
605
We're on the threshold of a new era of America...
O'MALLEY
We're on the threshold of a new era of America...
33
2
607
O'MALLEY: Thanks a lot.
O'MALLEY
Thanks a lot.
3
2
609
CLINTON: Well Lester, I spent a lot of time la...
CLINTON
Well Lester, I spent a lot of time last week b...
72
0
610
He had request for help and he had basically s...
CLINTON
He had request for help and he had basically s...
38
0
611
So I sent my top campaign aide down there to t...
CLINTON
So I sent my top campaign aide down there to t...
59
0
613
CLINTON: I want to be a president who takes ca...
CLINTON
I want to be a president who takes care of the...
25
0
614
(APPLAUSE)
CLINTON
(APPLAUSE)
1
0
617
SANDERS: Well, Secretary Clinton was right and...
SANDERS
Well, Secretary Clinton was right and what I d...
32
1
618
Now, we are a great nation -- and we've heard ...
SANDERS
Now, we are a great nation -- and we've heard ...
58
1
619
We've got to get rid of Super PACs, we've got ...
SANDERS
We've got to get rid of Super PACs, we've got ...
73
1
412 rows × 5 columns
In [22]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(words_ds.speach)
count_matrix = count_matrix.toarray()
word_count = pd.DataFrame(cv.get_feature_names(), columns=["word"])
word_count["count"] = count_matrix.sum(axis=0)
word_count = word_count.sort_values(by="count", ascending=False).reset_index(drop=True)
word_count[:]
Out[22]:
word
count
0
the
546
1
to
414
2
and
362
3
of
293
4
we
280
5
that
274
6
in
223
7
is
157
8
have
144
9
it
126
10
on
117
11
you
104
12
what
102
13
for
102
14
people
86
15
our
85
16
with
82
17
this
78
18
do
71
19
not
69
20
but
62
21
as
61
22
are
58
23
be
55
24
can
51
25
applause
49
26
need
49
27
was
48
28
so
48
29
all
47
...
...
...
1938
leaders
1
1939
boldly
1
1940
lay
1
1941
book
1
1942
border
1
1943
larger
1
1944
land
1
1945
lake
1
1946
laid
1
1947
lady
1
1948
legally
1
1949
less
1
1950
lost
1
1951
lesson
1
1952
loop
1
1953
biosurveillance
1
1954
living
1
1955
blame
1
1956
listened
1
1957
listen
1
1958
lindsey
1
1959
limit
1
1960
block
1
1961
lifting
1
1962
liable
1
1963
liability
1
1964
level
1
1965
lethal
1
1966
bloodiest
1
1967
zero
1
1968 rows × 2 columns
In [23]:
cl = MultinomialNB()
cl.fit(count_matrix, words_ds.speaker=="SANDERS")
df_vocab = pd.DataFrame(list(cv.vocabulary_.keys()), columns=["Vocab"])
df_vocab["Vocab_index"] = cv.vocabulary_.values()
df_vocab = df_vocab.sort_values("Vocab_index").reset_index(drop=True)
df_vocab["proba"] = cl.feature_log_prob_[0]
df_vocab["anti_proba"] = cl.feature_log_prob_[1]
df_vocab["difference"] = cl.feature_log_prob_[0] - cl.feature_log_prob_[1]
df_vocab.sort_values("difference", ascending=True)
Out[23]:
Vocab
Vocab_index
proba
anti_proba
difference
1064
major
1064
-9.080573
-6.454097
-2.626476
759
goldman
759
-9.080573
-6.677241
-2.403332
1507
sachs
1507
-9.080573
-6.677241
-2.403332
423
countries
423
-9.080573
-6.810772
-2.269801
406
contributions
406
-9.080573
-6.810772
-2.269801
418
corrupt
418
-9.080573
-6.810772
-2.269801
1627
spending
1627
-9.080573
-6.964923
-2.115650
381
companies
381
-9.080573
-6.964923
-2.115650
1411
real
1411
-9.080573
-6.964923
-2.115650
1852
vermont
1852
-9.080573
-6.964923
-2.115650
407
contributors
407
-9.080573
-7.147245
-1.933329
1480
revolution
1480
-9.080573
-7.147245
-1.933329
1070
man
1070
-9.080573
-7.147245
-1.933329
1481
rhetoric
1481
-9.080573
-7.147245
-1.933329
1776
transform
1776
-9.080573
-7.147245
-1.933329
1672
super
1672
-9.080573
-7.147245
-1.933329
1484
rid
1484
-9.080573
-7.147245
-1.933329
1782
treasury
1782
-9.080573
-7.147245
-1.933329
1576
she
1576
-9.080573
-7.147245
-1.933329
447
crumbling
447
-9.080573
-7.370388
-1.710185
997
latino
997
-9.080573
-7.370388
-1.710185
795
hampshire
795
-9.080573
-7.370388
-1.710185
1712
terms
1712
-8.387426
-6.677241
-1.710185
1675
supported
1675
-9.080573
-7.370388
-1.710185
1346
private
1346
-8.387426
-6.677241
-1.710185
1343
priority
1343
-9.080573
-7.370388
-1.710185
995
largest
995
-9.080573
-7.370388
-1.710185
151
area
151
-9.080573
-7.370388
-1.710185
133
anti
133
-9.080573
-7.370388
-1.710185
1238
pacs
1238
-9.080573
-7.370388
-1.710185
...
...
...
...
...
...
128
andrea
128
-6.307985
-7.658070
1.350086
1719
thank
1719
-6.682678
-8.063535
1.380857
718
frank
718
-6.682678
-8.063535
1.380857
23
30
23
-7.288814
-8.756682
1.467869
481
defend
481
-7.288814
-8.756682
1.467869
1084
matter
1084
-7.288814
-8.756682
1.467869
1799
try
1799
-7.288814
-8.756682
1.467869
809
haven
809
-7.288814
-8.756682
1.467869
918
intelligence
918
-7.288814
-8.756682
1.467869
86
age
86
-7.288814
-8.756682
1.467869
621
equal
621
-7.288814
-8.756682
1.467869
1874
visiting
1874
-7.288814
-8.756682
1.467869
156
around
156
-7.288814
-8.756682
1.467869
562
door
562
-7.288814
-8.756682
1.467869
707
forces
707
-7.288814
-8.756682
1.467869
1842
use
1842
-7.288814
-8.756682
1.467869
1957
year
1957
-6.515624
-8.063535
1.547911
972
keep
972
-7.134663
-8.756682
1.622019
273
build
273
-7.134663
-8.756682
1.622019
1634
stage
1634
-7.001132
-8.756682
1.755551
59
actually
59
-6.307985
-8.063535
1.755551
169
attacks
169
-7.001132
-8.756682
1.755551
420
costs
420
-7.001132
-8.756682
1.755551
791
had
791
-6.307985
-8.063535
1.755551
365
come
365
-7.001132
-8.756682
1.755551
1589
since
1589
-6.777988
-8.756682
1.978694
553
dodd
553
-6.777988
-8.756682
1.978694
84
again
84
-6.777988
-8.756682
1.978694
1281
plan
1281
-6.682678
-8.756682
2.074004
1021
lester
1021
-6.682678
-8.756682
2.074004
1968 rows × 5 columns
In [24]:
cl = MultinomialNB()
cl.fit(count_matrix, words_ds.speaker=="CLINTON")
df_vocab = pd.DataFrame(list(cv.vocabulary_.keys()), columns=["Vocab"])
df_vocab["Vocab_index"] = cv.vocabulary_.values()
df_vocab = df_vocab.sort_values("Vocab_index").reset_index(drop=True)
df_vocab["proba"] = cl.feature_log_prob_[0]
df_vocab["anti_proba"] = cl.feature_log_prob_[1]
df_vocab["difference"] = cl.feature_log_prob_[0] - cl.feature_log_prob_[1]
df_vocab.sort_values("difference", ascending=True)
Out[24]:
Vocab
Vocab_index
proba
anti_proba
difference
553
dodd
553
-9.111183
-6.410175
-2.701008
1799
try
1799
-9.111183
-6.921001
-2.190182
718
frank
718
-8.418036
-6.314865
-2.103171
1925
white
1925
-9.111183
-7.103322
-2.007861
1287
pleased
1287
-9.111183
-7.103322
-2.007861
1608
someone
1608
-9.111183
-7.103322
-2.007861
1628
spent
1628
-9.111183
-7.103322
-2.007861
625
especially
625
-9.111183
-7.103322
-2.007861
927
introduced
927
-9.111183
-7.103322
-2.007861
1487
rights
1487
-8.418036
-6.515535
-1.902500
1516
sanctions
1516
-9.111183
-7.326466
-1.784717
1349
problem
1349
-9.111183
-7.326466
-1.784717
895
incomes
895
-9.111183
-7.326466
-1.784717
650
experience
650
-9.111183
-7.326466
-1.784717
370
comments
370
-9.111183
-7.326466
-1.784717
1912
week
1912
-9.111183
-7.326466
-1.784717
679
fighters
679
-9.111183
-7.326466
-1.784717
147
approach
147
-9.111183
-7.326466
-1.784717
1544
sector
1544
-9.111183
-7.326466
-1.784717
1563
serious
1563
-9.111183
-7.326466
-1.784717
16
2011
16
-9.111183
-7.326466
-1.784717
791
had
791
-7.724888
-6.073703
-1.651186
420
costs
420
-8.418036
-6.766850
-1.651186
1877
voted
1877
-8.012571
-6.515535
-1.497035
1639
standing
1639
-9.111183
-7.614148
-1.497035
1786
treaty
1786
-9.111183
-7.614148
-1.497035
55
acted
55
-9.111183
-7.614148
-1.497035
699
flint
699
-9.111183
-7.614148
-1.497035
1878
votes
1878
-9.111183
-7.614148
-1.497035
1041
lone
1041
-9.111183
-7.614148
-1.497035
...
...
...
...
...
...
1274
ph
1274
-7.319423
-8.712760
1.393337
1371
provide
1371
-7.319423
-8.712760
1.393337
1852
vermont
1852
-7.319423
-8.712760
1.393337
7
15
7
-7.319423
-8.712760
1.393337
809
haven
809
-7.319423
-8.712760
1.393337
23
30
23
-7.319423
-8.712760
1.393337
1935
without
1935
-7.319423
-8.712760
1.393337
711
forward
711
-6.546233
-8.019613
1.473379
1540
seconds
1540
-7.165273
-8.712760
1.547487
632
ever
632
-7.165273
-8.712760
1.547487
729
front
729
-7.165273
-8.712760
1.547487
271
budget
271
-7.165273
-8.712760
1.547487
1792
true
1792
-7.165273
-8.712760
1.547487
418
corrupt
418
-7.165273
-8.712760
1.547487
1271
person
1271
-7.165273
-8.712760
1.547487
1267
percent
1267
-7.165273
-8.712760
1.547487
406
contributions
406
-7.165273
-8.712760
1.547487
561
done
561
-7.165273
-8.712760
1.547487
952
issues
952
-7.165273
-8.712760
1.547487
423
countries
423
-7.165273
-8.712760
1.547487
1537
second
1537
-7.031741
-8.712760
1.681019
1542
secretary
1542
-5.645447
-7.326466
1.681019
1507
sachs
1507
-7.031741
-8.712760
1.681019
759
goldman
759
-7.031741
-8.712760
1.681019
128
andrea
128
-6.277969
-8.019613
1.741643
1731
things
1731
-6.220811
-8.019613
1.798802
1712
terms
1712
-6.913958
-8.712760
1.798802
1064
major
1064
-6.808598
-8.712760
1.904162
1728
these
1728
-6.472125
-8.712760
2.240635
350
clinton
350
-6.066660
-8.712760
2.646100
1968 rows × 5 columns
In [25]:
cl = MultinomialNB()
cl.fit(count_matrix, words_ds.speaker=="O'MALLY")
df_vocab = pd.DataFrame(list(cv.vocabulary_.keys()), columns=["Vocab"])
df_vocab["Vocab_index"] = cv.vocabulary_.values()
df_vocab = df_vocab.sort_values("Vocab_index").reset_index(drop=True)
df_vocab["proba"] = cl.feature_log_prob_[0]
df_vocab["anti_proba"] = cl.feature_log_prob_[1]
df_vocab["difference"] = cl.feature_log_prob_[0] - cl.feature_log_prob_[1]
df_vocab.sort_values("difference", ascending=True)
Out[25]:
Vocab
Vocab_index
proba
anti_proba
difference
983
knew
983
-8.792398
-7.584773
-1.207625
1129
momentum
1129
-8.792398
-7.584773
-1.207625
1127
mom
1127
-8.792398
-7.584773
-1.207625
1124
mixed
1124
-8.792398
-7.584773
-1.207625
1122
mission
1122
-8.792398
-7.584773
-1.207625
1121
missing
1121
-8.792398
-7.584773
-1.207625
1120
minutes
1120
-8.792398
-7.584773
-1.207625
1119
minus
1119
-8.792398
-7.584773
-1.207625
1131
months
1131
-8.792398
-7.584773
-1.207625
1116
mindful
1116
-8.792398
-7.584773
-1.207625
1111
militarize
1111
-8.792398
-7.584773
-1.207625
1109
midst
1109
-8.792398
-7.584773
-1.207625
1107
michigan
1107
-8.792398
-7.584773
-1.207625
1105
message
1105
-8.792398
-7.584773
-1.207625
1103
mention
1103
-8.792398
-7.584773
-1.207625
1102
mentally
1102
-8.792398
-7.584773
-1.207625
1099
memphis
1099
-8.792398
-7.584773
-1.207625
1114
millionaires
1114
-8.792398
-7.584773
-1.207625
1132
moon
1132
-8.792398
-7.584773
-1.207625
1133
moral
1133
-8.792398
-7.584773
-1.207625
1140
moving
1140
-8.792398
-7.584773
-1.207625
1173
nothing
1173
-8.792398
-7.584773
-1.207625
1171
normalize
1171
-8.792398
-7.584773
-1.207625
1170
normalization
1170
-8.792398
-7.584773
-1.207625
1169
nonsense
1169
-8.792398
-7.584773
-1.207625
1165
nightmare
1165
-8.792398
-7.584773
-1.207625
1164
nickel
1164
-8.792398
-7.584773
-1.207625
1160
neither
1160
-8.792398
-7.584773
-1.207625
1159
neighbors
1159
-8.792398
-7.584773
-1.207625
1158
negotiating
1158
-8.792398
-7.584773
-1.207625
...
...
...
...
...
...
103
all
103
-5.614344
-7.584773
1.970429
1602
so
1602
-5.593725
-7.584773
1.991048
1897
was
1897
-5.593725
-7.584773
1.991048
144
applause
144
-5.573522
-7.584773
2.011251
1154
need
1154
-5.573522
-7.584773
2.011251
293
can
293
-5.534301
-7.584773
2.050472
202
be
202
-5.460193
-7.584773
2.124580
150
are
150
-5.408007
-7.584773
2.176766
158
as
158
-5.358411
-7.584773
2.226363
284
but
284
-5.342410
-7.584773
2.242363
1172
not
1172
-5.237050
-7.584773
2.347723
551
do
551
-5.208879
-7.584773
2.375894
1737
this
1737
-5.116097
-7.584773
2.468676
1933
with
1933
-5.066704
-7.584773
2.518069
1223
our
1223
-5.031198
-7.584773
2.553575
1265
people
1265
-5.019637
-7.584773
2.565136
705
for
705
-4.850816
-7.584773
2.733957
1917
what
1917
-4.850816
-7.584773
2.733957
1961
you
1961
-4.831585
-7.584773
2.753189
1198
on
1198
-4.714860
-7.584773
2.869913
953
it
953
-4.641358
-7.584773
2.943415
808
have
808
-4.508811
-7.584773
3.075962
943
is
943
-4.422950
-7.584773
3.161823
886
in
886
-4.073899
-7.584773
3.510874
1721
that
1721
-3.868774
-7.584773
3.715999
1903
we
1903
-3.847190
-7.584773
3.737583
1186
of
1186
-3.801965
-7.584773
3.782808
127
and
127
-3.591142
-7.584773
3.993631
1751
to
1751
-3.457266
-7.584773
4.127507
1722
the
1722
-3.181096
-7.584773
4.403677
1968 rows × 5 columns
In [ ]:
Content source: TwistedHardware/mltutorial
Similar notebooks: